suppressPackageStartupMessages({
library(SummarizedExperiment)
library(SEtools)
library(ggplot2)
library(plgINS)
library(edgeR)
})#' exonDEA
#'
#' Generates control samples out of the average values over all supplied samples, combines spliced and unspliced
#' assays, calculates their logCPM & log2FC data and performs DEA over all treatments & individual ones.
#' All is assembled into one SE object.
#'
#'
#' @param se SummarizedExperiment object containing assays of raw counts of spliced & unspliced tx
#'
#' @return an SE object of combined spliced and unspliced data
#'
exonDEA <- function(se, model, model0=~1){
# allocation
e1 <- assays(se)$spliced
e2 <- assays(se)$unspliced
readtype1 <- "spliced"
readtype2 <- "unspliced"
# generate control
## create logcpm assays for both spliced & unspliced assays
assays(se)$logcpm.spliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$spliced))))
assays(se)$logcpm.unspliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$unspliced))))
## generate a 'control' sample out of the median normalized counts over all samples
genCTRL <- function(se, logcpm){
e.ctrl <- sapply(unique(se$Batch), ls=min(colSums(assay(se))),
FUN=function(x,ls){
rowMedians(exp(assays(se)[[logcpm]][,se$Batch==x])-1) * ls/1000000
}
)
### generate colnames
n.cols <- sapply( unique(se$Batch), FUN=function(x)
var_name <- paste("CTRL", x, sep=".") )
colnames(e.ctrl) <- n.cols
rownames(e.ctrl) <- rownames(se)
return(e.ctrl)
}
e.ctrl1 <- genCTRL(se, "logcpm.spliced")
e.ctrl2 <- genCTRL(se, "logcpm.unspliced")
## add control samples to assays & generate DGEList object
dds1 <- DGEList(cbind(e1, e.ctrl1))
dds2 <- DGEList(cbind(e2, e.ctrl2))
## combine the spliced & unspliced assays
dds <- cbind(dds1, dds2)
# generate colData
## generate colData info for combined assay
dd1 <- colData(se)[,c("Batch","miRNA","Cell_Line")]
dd.ctrl1 <- data.frame(Batch=unique(se$Batch),
miRNA="CTRL",
Cell_Line=unique(as.character(dd1$Cell_Line)) )
dd1 <- rbind(dd1, dd.ctrl1)
dd1$readtype <- readtype1
## duplicate dd to have data for combined spliced & unspliced assay
dd <- rbind(dd1, dd1)
dd$readtype[(nrow(dd1)+1):nrow(dd)] <- readtype2
dd$readtype <- as.factor(dd$readtype)
## rename both dds & dd object features
n.cols1 <- sapply(colnames(dds1), FUN=function(x)
var_name <- paste(x, readtype1, sep=".") )
n.cols2 <- sapply(colnames(dds2), FUN=function(x)
var_name <- paste(x, readtype2, sep=".") )
colnames(dds) <- c(n.cols1, n.cols2)
rownames(dd) <- c(n.cols1, n.cols2)
# DEA
dd$miRNA <- relevel(droplevels(dd$miRNA), ref="CTRL")
dd$readtype <- relevel(dd$readtype, ref="unspliced")
dd$Batch <- droplevels(dd$Batch)
## To do the normalization
dds <- calcNormFactors(dds)
## models
mm <- model.matrix(model, data=dd)
mm0 <- model.matrix(model0, data=dd)
testCoeff <- setdiff(colnames(mm), colnames(mm0))
## estimate dispersion
dds <- estimateDisp(dds,mm)
## fit negative binomial distribution on counts per gene (use glmFit for few replicates)
fit <- glmFit(dds, mm)
## fit a GLM on the data
lrt.comb <- glmLRT(fit, testCoeff)
## top genes that change relative to stage 0
res.comb <- as.data.frame(topTags(lrt.comb, Inf))
## fit linear model dropping one sample at a time (using multiple cores)
res.list <- bplapply( testCoeff, BPPARAM=MulticoreParam(8), FUN=function(x){
as.data.frame(topTags(glmLRT(fit, x),Inf))
})
dea.names <- gsub("readtype","", testCoeff)
dea.names <- make.names(gsub(":miRNA",".", dea.names))
colnames(res.comb)[grepl("logFC",colnames(res.comb))] <- paste0("logFC.", dea.names)
names(res.list) <- paste0("DEA.",dea.names)
# generate final SE object
## SE object with logCPM & logFC assays
se <- SummarizedExperiment(assays=list(counts=dds$counts),
rowData=rowData(se),
colData=dd)
assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
se <- SEtools::log2FC(se, controls = se$miRNA=="CTRL", by = paste(se$Batch, se$readtype), fromAssay = "logcpm")
## add DEAs
rowData(se)$DEA.spliced.all <- DataFrame(res.comb[rownames(se),])
for(i in paste0("DEA.",dea.names)){
rowData(se)[[i]] <- DataFrame(res.list[[i]][rownames(se),])
}
# output
return(se)
}# bartel hela & hek DEA with exon resolution
## combined readtype:miRNA effect
se.hela <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HeLa"],
model = ~readtype*miRNA, model0 = ~readtype+miRNA)
se.hek <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HEK293FT"],
model = ~readtype*miRNA, model0 = ~readtype+miRNA)
# miRNA effect
se.hela.test <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HeLa"],
model = ~readtype+miRNA, model0 = ~readtype)
se.hek.test <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HEK293FT"],
model = ~readtype+miRNA, model0 = ~readtype)
# miRNA & combined exon:miRNA effect
se.hela.test2 <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HeLa"],
model = ~readtype*miRNA, model0 = ~readtype)
se.hek.test2 <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HEK293FT"],
model = ~readtype*miRNA, model0 = ~readtype)
# batch effect included
se.hela.batch <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HeLa"],
model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA)
se.hek.batch <- exonDEA(se.splicing[,se.splicing$Cell_Line=="HEK293FT"],
model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA)# logFC heatmap HeLa [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
SEtools::sehm(se.hela[,order(colData(se.hela)$miRNA)], row.names(se.hela)[rowData(se.hela)$DEA.spliced.all$FDR < 0.01], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# logCPM heatmap HeLa spliced [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
se.hela.sp <- se.hela[,colData(se.hela)$readtype=="spliced"]
SEtools::sehm(se.hela.sp[,order(colData(se.hela.sp)$miRNA)], row.names(se.hela.sp)[rowData(se.hela.sp)$DEA.spliced.all$FDR < 0.01], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "logcpm", anno_columns = c("miRNA","readtype"))# batch logFC heatmap HeLa [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
SEtools::sehm(se.hela.batch[,order(colData(se.hela.batch)$miRNA)], row.names(se.hela.batch)[rowData(se.hela.batch)$DEA.spliced.all$FDR < 0.01], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# logFC heatmap HeLa [model = ~readtype+miRNA, model0 = ~readtype]
SEtools::sehm(se.hela.test[,order(colData(se.hela.test)$miRNA)], row.names(se.hela.test)[rowData(se.hela.test)$DEA.spliced.all$FDR < 0.00001], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# logFC heatmap HeLa [model = ~readtype*miRNA, model0 = ~readtype]
SEtools::sehm(se.hela.test2[,order(colData(se.hela.test2)$miRNA)], row.names(se.hela.test2)[rowData(se.hela.test2)$DEA.spliced.all$FDR < 0.00001], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# logFC heatmap HEK [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
SEtools::sehm(se.hek[,order(colData(se.hek)$miRNA)], row.names(se.hek)[rowData(se.hek)$DEA.spliced.all$FDR < 0.01], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# logCPM heatmap HEK spliced [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
se.hek.sp <- se.hek[,colData(se.hek)$readtype=="spliced"]
SEtools::sehm(se.hek.sp[,order(colData(se.hek.sp)$miRNA)], row.names(se.hek.sp)[rowData(se.hek.sp)$DEA.spliced.all$FDR < 0.01], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "logcpm", anno_columns = c("miRNA","readtype"))# batch logFC heatmap HEK [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
SEtools::sehm(se.hek.batch[,order(colData(se.hek.batch)$miRNA)], row.names(se.hek.batch)[rowData(se.hek.batch)$DEA.spliced.all$FDR < 0.01], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# logFC heatmap HEK [model = ~readtype+miRNA, model0 = ~readtype]
SEtools::sehm(se.hek.test[,order(colData(se.hek.test)$miRNA)], row.names(se.hek.test)[rowData(se.hek.test)$DEA.spliced.all$FDR < 0.00001], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# logFC heatmap HEK [model = ~readtype*miRNA, model0 = ~readtype]
SEtools::sehm(se.hek.test2[,order(colData(se.hek.test2)$miRNA)], row.names(se.hek.test2)[rowData(se.hek.test2)$DEA.spliced.all$FDR < 0.00001], gaps_at = "readtype",
breaks=T, do.scale = T, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))# spliced v unspliced HeLa
lib.hela.spliced <- apply(assays(se.hela[,colData(se.hela)$readtype=="spliced"])$counts, 2, function(x) sum(x))
lib.hela.unspliced <- apply(assays(se.hela[,colData(se.hela)$readtype=="unspliced"])$counts, 2, function(x) sum(x))
median(lib.hela.unspliced / lib.hela.spliced)## [1] 0.2249225
# spliced v unspliced HEK
lib.hek.spliced <- apply(assays(se.hek[,colData(se.hek)$readtype=="spliced"])$counts, 2, function(x) sum(x))
lib.hek.unspliced <- apply(assays(se.hek[,colData(se.hek)$readtype=="unspliced"])$counts, 2, function(x) sum(x))
median(lib.hek.unspliced / lib.hek.spliced)## [1] 0.2012097